import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings raised by the plotting libraries used below.
warnings.filterwarnings("ignore")
# Apply matplotlib's 'fivethirtyeight' style sheet to all figures drawn afterwards.
plt.style.use('fivethirtyeight')
pip install yellowbrick
Defaulting to user installation because normal site-packages is not writeable
Collecting yellowbrick
Downloading yellowbrick-1.5-py3-none-any.whl (282 kB)
------------------------------------ 282.6/282.6 kB 562.7 kB/s eta 0:00:00
Requirement already satisfied: matplotlib!=3.0.0,>=2.0.2 in c:\programdata\anaconda3\lib\site-packages (from yellowbrick) (3.5.1)
Requirement already satisfied: scipy>=1.0.0 in c:\programdata\anaconda3\lib\site-packages (from yellowbrick) (1.7.3)
Requirement already satisfied: scikit-learn>=1.0.0 in c:\programdata\anaconda3\lib\site-packages (from yellowbrick) (1.0.2)
Requirement already satisfied: numpy>=1.16.0 in c:\users\hp\appdata\roaming\python\python39\site-packages (from yellowbrick) (1.22.4)
Requirement already satisfied: cycler>=0.10.0 in c:\programdata\anaconda3\lib\site-packages (from yellowbrick) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in c:\programdata\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (4.25.0)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\programdata\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.3.2)
Requirement already satisfied: packaging>=20.0 in c:\programdata\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (21.3)
Requirement already satisfied: pillow>=6.2.0 in c:\programdata\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (9.0.1)
Requirement already satisfied: pyparsing>=2.2.1 in c:\programdata\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (3.0.4)
Requirement already satisfied: python-dateutil>=2.7 in c:\programdata\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (2.8.2)
Requirement already satisfied: joblib>=0.11 in c:\users\hp\appdata\roaming\python\python39\site-packages (from scikit-learn>=1.0.0->yellowbrick) (1.2.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\hp\appdata\roaming\python\python39\site-packages (from scikit-learn>=1.0.0->yellowbrick) (3.1.0)
Requirement already satisfied: six>=1.5 in c:\programdata\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.16.0)
Installing collected packages: yellowbrick
Successfully installed yellowbrick-1.5
Note: you may need to restart the kernel to use updated packages.
[notice] A new release of pip is available: 23.1.2 -> 23.2.1 [notice] To update, run: python.exe -m pip install --upgrade pip
# Load the customer table from a CSV file given by a relative path.
data=pd.read_csv("Mall_Customers.csv")
# Bare expression: the notebook renders the frame as the cell output.
data
| | CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) |
|---|---|---|---|---|---|
| 0 | 1 | Male | 19 | 15 | 39 |
| 1 | 2 | Male | 21 | 15 | 81 |
| 2 | 3 | Female | 20 | 16 | 6 |
| 3 | 4 | Female | 23 | 16 | 77 |
| 4 | 5 | Female | 31 | 17 | 40 |
| ... | ... | ... | ... | ... | ... |
| 195 | 196 | Female | 35 | 120 | 79 |
| 196 | 197 | Female | 45 | 126 | 28 |
| 197 | 198 | Male | 32 | 126 | 74 |
| 198 | 199 | Male | 32 | 137 | 18 |
| 199 | 200 | Male | 30 | 137 | 83 |
200 rows × 5 columns
# Call the method: the bare attribute only echoes the bound-method repr instead
# of the summary statistics.
data.describe()
<bound method NDFrame.describe of CustomerID Gender Age Annual Income (k$) Spending Score (1-100) 0 1 Male 19 15 39 1 2 Male 21 15 81 2 3 Female 20 16 6 3 4 Female 23 16 77 4 5 Female 31 17 40 .. ... ... ... ... ... 195 196 Female 35 120 79 196 197 Female 45 126 28 197 198 Male 32 126 74 198 199 Male 32 137 18 199 200 Male 30 137 83 [200 rows x 5 columns]>
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape
(200, 5)
# Call the method so the per-column summary is printed; the bare attribute only
# echoes the bound-method repr.
data.info()
<bound method DataFrame.info of CustomerID Gender Age Annual Income (k$) Spending Score (1-100) 0 1 Male 19 15 39 1 2 Male 21 15 81 2 3 Female 20 16 6 3 4 Female 23 16 77 4 5 Female 31 17 40 .. ... ... ... ... ... 195 196 Female 35 120 79 196 197 Female 45 126 28 197 198 Male 32 126 74 198 199 Male 32 137 18 199 200 Male 30 137 83 [200 rows x 5 columns]>
# Count the missing values in each column.
data.isnull().sum()
CustomerID 0 Gender 0 Age 0 Annual Income (k$) 0 Spending Score (1-100) 0 dtype: int64
# List the column labels of the frame.
data.columns
Index(['CustomerID', 'Gender', 'Age', 'Annual Income (k$)',
'Spending Score (1-100)'],
dtype='object')
# Show the dtype of every column.
data.dtypes
CustomerID int64 Gender object Age int64 Annual Income (k$) int64 Spending Score (1-100) int64 dtype: object
## Exploratory Data Analysis
# Age series split by gender; both are reused by the age histograms further down.
m_age = data.loc[data['Gender'] == 'Male', 'Age']
f_age = data.loc[data['Gender'] == 'Female', 'Age']

# Shared colours (male, female); also reused by later plots.
palette_color = ['#BAD7E9', '#EB455F']
gender_labels = ['Male', 'Female']
sizes = [m_age.count(), f_age.count()]

# Draw on the two axes created here instead of opening extra, conflicting
# plt.subplot(...) grids on top of them.
fig, (pie_ax, count_ax) = plt.subplots(nrows=1, ncols=2, figsize=(16, 5))

# Left panel: share of each gender.
pie_ax.pie(sizes, labels=gender_labels, colors=palette_color,
           explode=(0, 0.05), autopct='%.0f%%')
pie_ax.set_title('Gender Division')

# Right panel: absolute counts. `order` pins the bar order so that each gender
# gets the same colour as in the pie chart.
sns.countplot(x='Gender', data=data, order=gender_labels,
              palette=palette_color, ax=count_ax)
for container in count_ax.containers:
    count_ax.bar_label(container)  # print the count on top of each bar
count_ax.set_title('Gender Count')

plt.show()
# Bin edges for the age histograms: 17, 24, ..., 73.
age_bins = range(17, 77, 7)
bin_edges = list(age_bins)

# One panel per gender with a shared y axis so the bar heights are comparable.
fig2, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5), sharey=True)
panels = [
    (ax1, m_age, 'Males', '#BAD7E9'),
    (ax2, f_age, 'Females', '#EB455F'),
]
for axis, ages, title, colour in panels:
    # histplot replaces the deprecated distplot(..., kde=False).
    sns.histplot(ages, bins=bin_edges, color=colour, ax=axis,
                 edgecolor="k", linewidth=2)
    axis.set_xticks(bin_edges)
    axis.set_title(title)
    axis.set_ylabel('Count')
    # Summary annotations placed in data coordinates near the top of the panel.
    axis.text(45, 23, "TOTAL count: {}".format(ages.count()))
    axis.text(45, 22, "Mean age: {:.1f}".format(ages.mean()))

# Fix the shared upper limit after both panels are drawn so that plotting the
# second histogram cannot rescale the axis and push the annotations out of view.
ax1.set_ylim(top=25)
plt.show()
## Plotting relations between Age, Annual Income and Spending Score
# Histogram plus KDE curve for each numeric feature, side by side.
plt.figure(1, figsize=(15, 6))
for n, x in enumerate(['Age', 'Annual Income (k$)', 'Spending Score (1-100)'], start=1):
    plt.subplot(1, 3, n)
    plt.subplots_adjust(hspace=0.5, wspace=0.5)
    # histplot with a density-scaled histogram and a KDE overlay is the
    # replacement for the deprecated distplot; cut=3 keeps the KDE tails
    # extending past the data range as distplot did.
    sns.histplot(data[x], bins=20, kde=True, stat="density", kde_kws=dict(cut=3))
    plt.title('Distplot of {}'.format(x))
plt.show()
## Plot distributions using violin plots and boxplots
# One horizontal boxplot per numeric feature, split by gender.
numeric_features = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
plt.figure(1, figsize=(14, 7))
for n, cols in enumerate(numeric_features, start=1):
    plt.subplot(1, 3, n)
    plt.subplots_adjust(hspace=0.5, wspace=0.5)
    sns.boxplot(x=cols, y='Gender', data=data, palette=palette_color)
    # Only the middle panel carries the title so it reads as a figure heading.
    panel_title = ''
    if n == 2:
        panel_title = 'Boxplots'
    plt.title(panel_title)
plt.show()
## Plot how the Age, Annual Income and Spending Score columns relate to each other
# 3x3 grid of regression plots: every feature against every feature
# (rows iterate over x, columns over y).
numeric_features = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
feature_pairs = [(x, y) for x in numeric_features for y in numeric_features]

plt.figure(1, figsize=(15, 10))
for n, (x, y) in enumerate(feature_pairs, start=1):
    plt.subplot(3, 3, n)
    plt.subplots_adjust(hspace=0.5, wspace=0.5)
    sns.regplot(x=x, y=y, data=data)
    # Shorten multi-word column names to their first two words for the axis label.
    words = y.split()
    plt.ylabel(' '.join(words[:2]) if len(words) > 1 else y)
plt.show()
# Violin plot with the individual observations (swarm) overlaid, one panel per
# numeric feature, split by gender.
plt.figure(1, figsize=(15, 7))
for n, cols in enumerate(['Age', 'Annual Income (k$)', 'Spending Score (1-100)'], start=1):
    plt.subplot(1, 3, n)
    plt.subplots_adjust(hspace=0.5, wspace=0.5)
    sns.violinplot(x=cols, y='Gender', data=data, palette=palette_color)
    sns.swarmplot(x=cols, y='Gender', data=data)
    # Label the y axis only on the first panel; title only the middle panel.
    plt.ylabel('Gender' if n == 1 else '')
    # The panels show violin plots, so the title names them as such.
    plt.title('Violinplots & Swarmplots' if n == 2 else '')
plt.show()
# Age against annual income, one colour per gender.
plt.figure(1, figsize=(15, 6))
for gender in ['Male', 'Female']:
    subset = data[data['Gender'] == gender]
    plt.scatter(x='Age', y='Annual Income (k$)', data=subset,
                s=200, alpha=0.5, label=gender)
    plt.xlabel('Age')
    plt.ylabel('Annual Income (k$)')
plt.title('Age vs Annual Income w.r.t Gender')
plt.legend()
plt.show()
# Annual income against spending score, one colour per gender.
plt.figure(1, figsize=(15, 6))
for gender in ['Male', 'Female']:
    subset = data[data['Gender'] == gender]
    plt.scatter(x='Annual Income (k$)', y='Spending Score (1-100)', data=subset,
                s=200, alpha=0.5, label=gender)
    plt.xlabel('Annual Income (k$)')
    plt.ylabel('Spending Score (1-100)')
plt.title('Annual Income vs Spending Score w.r.t Gender')
plt.legend()
plt.show()
# Age against spending score, one colour per gender.
# NOTE(review): this cell was a verbatim copy of the earlier Age vs Annual
# Income plot; it now shows the one feature pair that had no gender scatter.
# Confirm this is the plot that was intended here.
plt.figure(1, figsize=(15, 6))
for gender in ['Male', 'Female']:
    subset = data[data['Gender'] == gender]
    plt.scatter(x='Age', y='Spending Score (1-100)', data=subset,
                s=200, alpha=0.5, label=gender)
plt.xlabel('Age')
plt.ylabel('Spending Score (1-100)')
plt.title('Age vs Spending Score w.r.t Gender')
plt.legend()
plt.show()
## Correlation Matrix
# Correlation heatmap over all columns, with Gender encoded numerically.
fig, ax = plt.subplots(figsize=(14, 6))

# Work on a copy so the encoding does not leak into `data`.
data1 = data.copy()
# Assign the encoded column back explicitly (Male -> 0, Female -> 1). An
# in-place replace on the selected column is not guaranteed to write through
# to `data1` on newer pandas, which would leave strings in the frame and break
# corr().
data1['Gender'] = data1['Gender'].map({'Male': 0, 'Female': 1})

color = sns.color_palette("ch:start=.2,rot=-.3", as_cmap=True)
# Draw on the axes created above rather than on whatever axes is current.
sns.heatmap(data1.corr(), cmap=color, square=True, annot=True, ax=ax)
plt.show()
# Features used for k-means clustering (this column order is also the column
# order of the fitted cluster centres).
selected_cols = ["Spending Score (1-100)", "Annual Income (k$)", "Age"]
cluster_data = data[selected_cols]

## Data Scaling
# Standardise the selected features with scikit-learn's StandardScaler.
# NOTE(review): the clustering cells visible below fit on `cluster_data`, not
# on `cluster_scaled` -- confirm whether the scaled array is meant to be used.
scaler = StandardScaler()
cluster_scaled = scaler.fit(cluster_data).transform(cluster_data)
from sklearn.cluster import KMeans

# Elbow method: fit k-means for k = 1..10 and record the within-cluster sum of
# squares (inertia_) of each fit.
cluster_counts = range(1, 11)
wcss = []
for i in cluster_counts:
    # n_init is pinned because its default differs between scikit-learn
    # releases; together with random_state this keeps the curve reproducible.
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=0)
    kmeans.fit(cluster_data)
    wcss.append(kmeans.inertia_)

plt.plot(cluster_counts, wcss)
plt.title('The Elbow Method')
plt.xlabel('no of clusters')
plt.ylabel('wcss')
plt.show()
from yellowbrick.cluster import KElbowVisualizer

# No install step here: yellowbrick is installed by the pip cell near the top
# of the notebook, and an install placed after the import could not help anyway.

# Let yellowbrick fit the model over a range of k and mark the elbow.
# n_init is pinned because its default differs between scikit-learn releases.
model = KMeans(n_init=10, random_state=1)
# NOTE(review): per the yellowbrick docs a 2-tuple is treated as a half-open
# range, so k=10 itself is not evaluated -- confirm that is intended.
visualizer = KElbowVisualizer(model, k=(2, 10))
visualizer.fit(cluster_data)
visualizer.show()
plt.show()
ERROR: Could not find a version that satisfies the requirement yellobrick (from versions: none) ERROR: No matching distribution found for yellobrick [notice] A new release of pip is available: 23.1.2 -> 23.2.1 [notice] To update, run: python.exe -m pip install --upgrade pip
Defaulting to user installation because normal site-packages is not writeable
# Fit the final 5-cluster model. random_state and n_init are pinned so the
# cluster labels and sizes are the same on every run.
KM_5_clusters = KMeans(n_clusters=5, init='k-means++', n_init=10,
                       random_state=0).fit(cluster_data)

# Copy of the features with the assigned cluster label appended.
KM5_clustered = cluster_data.copy()
KM5_clustered['Cluster'] = KM_5_clusters.labels_

# cluster_centers_ has one column per fitted feature, in the column order of
# `cluster_data`. Label the columns so each centre is plotted against the
# matching axis by name instead of by a hard-coded position.
centers = pd.DataFrame(KM_5_clusters.cluster_centers_, columns=cluster_data.columns)

fig1, axes = plt.subplots(1, 2, figsize=(12, 5))

# Left: income vs spending score; right: age vs spending score.
sns.scatterplot(x='Annual Income (k$)', y='Spending Score (1-100)', data=KM5_clustered,
                hue='Cluster', palette='Set1', legend='full', ax=axes[0])
sns.scatterplot(x='Age', y='Spending Score (1-100)', data=KM5_clustered,
                hue='Cluster', palette='Set1', legend='full', ax=axes[1])

# Overlay the cluster centres as blue squares in the same coordinates as each panel.
axes[0].scatter(centers['Annual Income (k$)'], centers['Spending Score (1-100)'],
                marker='s', s=40, c="blue")
axes[1].scatter(centers['Age'], centers['Spending Score (1-100)'],
                marker='s', s=40, c="blue")
plt.show()
# Number of rows assigned to each cluster, as a one-column frame named "KM_size".
KM_clust_sizes = KM5_clustered.groupby('Cluster').size().rename("KM_size").to_frame()
KM_clust_sizes
| Cluster | KM_size |
|---|---|
| 0 | 39 |
| 1 | 79 |
| 2 | 23 |
| 3 | 37 |
| 4 | 22 |
import plotly as py
import plotly.graph_objs as go
def tracer(db, n, name):
    '''
    Build the Plotly 3-D scatter trace for the rows of one cluster.

    Parameters
    ----------
    db : frame indexable by the columns 'Cluster', 'Age',
        'Spending Score (1-100)' and 'Annual Income (k$)'
    n : cluster label; rows where db['Cluster'] == n are plotted
    name : trace name passed to Plotly

    Returns
    -------
    go.Scatter3d with x = Age, y = Spending Score, z = Annual Income,
    drawn as markers of size 5.
    '''
    # Filter once and reuse the subset for all three axes.
    members = db[db['Cluster'] == n]
    return go.Scatter3d(
        x=members['Age'],
        y=members['Spending Score (1-100)'],
        z=members['Annual Income (k$)'],
        mode='markers',
        name=name,
        marker=dict(size=5),
    )
# One trace per cluster label 0-4, named 'Cluster <label>'.
# Kept in `traces` rather than `data` so the customer DataFrame loaded at the
# top of the notebook is not overwritten (earlier cells stay re-runnable).
traces = [
    tracer(KM5_clustered, label, 'Cluster {}'.format(label))
    for label in range(5)
]

# Axis titles follow the x/y/z assignment made inside tracer().
layout = go.Layout(
    title='Clusters by K-Means',
    scene=dict(
        xaxis=dict(title='Age'),
        yaxis=dict(title='Spending Score'),
        zaxis=dict(title='Annual Income'),
    ),
)

fig = go.Figure(data=traces, layout=layout)
py.offline.iplot(fig)